Use native support for atomic FADD when address is in LDS (#1591)

2019-11-22 02:23:48 +00:00
commit 2ed3a0873c
@@ -55,7 +55,7 @@ unsigned long long atomicAdd(
 }
 __device__
 inline
-float atomicAdd(float* address, float val)
+float atomicAdd_impl(float* address, float val)
 {
    unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
    unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
@@ -73,6 +73,37 @@ float atomicAdd(float* address, float val)

    return __uint_as_float(r);
 }
+#if !__has_builtin(__builtin_amdgcn_is_shared)
+    __device__
+    inline
+    bool __builtin_amdgcn_is_shared(
+        const __attribute__((address_space(0))) void* ptr) noexcept
+    {
+        #if defined(__HIP_DEVICE_COMPILE__)
+            const unsigned int gp = reinterpret_cast<unsigned long long>(ptr);
+
+            return gp ==
+                (__builtin_amdgcn_s_getreg((15 << 11) | (16 << 6) | 15) << 16);
+        #else
+            return false;
+        #endif
+    }
+#endif
+__device__
+inline
+float atomicAdd(float* address, float val)
+{
+    using GP = const __attribute__((address_space(0))) void*;
+    using LP = __attribute__((address_space(3))) float*;
+
+    #if __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+        if (__builtin_amdgcn_is_shared((GP) address)) {
+            return __builtin_amdgcn_ds_faddf((LP) address, val, 0, 0, false);
+        }
+    #endif
+
+    return atomicAdd_impl(address, val);
+}
 __device__
 inline
 double atomicAdd(double* address, double val)
@@ -29,6 +29,7 @@ THE SOFTWARE.

 // includes, system
 #include <algorithm>
+#include <cstring>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -212,7 +213,7 @@ __device__
 void testKernelSub(...) {}

 template<
-    typename T, 
+    typename T,
    typename enable_if<
        is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
 __device__
@@ -259,13 +260,37 @@ void testKernelIntegral(T* g_odata) {
    testKernelSub(g_odata);
 }

+namespace {
+    constexpr unsigned int numData = 11;
+}
+
 template<typename T>
-__global__ void testKernel(T* g_odata) {
+__global__ void testKernel(T* g_odata, T* g_sdata) {
    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    testKernelIntegral(g_odata);
    testKernelExch(g_odata);
+
+    #if !defined(HIP_PLATFORM_NVCC)
+        // Shared Atomic addition.
+        __shared__ T s_odata[numData];
+
+        if (threadIdx.x == 0) { s_odata[8] = s_odata[10] = 0xff; }
+
+        __syncthreads();
+
+        atomicAdd(&s_odata[0], 10);
+
+        testKernelIntegral(s_odata);
+        testKernelExch(s_odata);
+
+        __syncthreads();
+
+        if (threadIdx.x == 0) {
+            __builtin_memcpy(g_sdata, s_odata, sizeof(T) * numData);
+        }
+    #endif
 }

 template<typename T>
@@ -286,33 +311,38 @@ void runTest() {

    unsigned int numThreads = 256;
    unsigned int numBlocks = 64;
-    unsigned int numData = 11;
    unsigned int memSize = sizeof(T) * numData;

    // allocate mem for the result on host side
    T* hOData = (T*)malloc(memSize);
+    T* hSData = (T*)malloc(memSize);

    // initialize the memory
-    for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
+    std::memset(hOData, 0, memSize);
+    std::memset(hSData, 0, memSize);

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    T* dOData;
+    T* dSData;
    hipMalloc((void**)&dOData, memSize);
+    hipMalloc((void**)&dSData, memSize);
    // copy host memory to device to initialize to zero
    hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);

    // execute the kernel
    hipLaunchKernelGGL(
-        testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
+        testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData, dSData);

    // Copy result from device to host
    hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
+    hipMemcpy(hSData, dSData, memSize, hipMemcpyDeviceToHost);

    // Compute reference solution
-    testResult = computeGold(hOData, numThreads * numBlocks);
+    testResult = std::equal(hOData, hOData + numData, hSData) &&
+                 computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);