From 2ed3a0873cbcfa1e2b7ac3a83ccc3aa31154fcda Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Fri, 22 Nov 2019 02:23:48 +0000
Subject: [PATCH] Use native support for atomic FADD when address is in LDS
 (#1591)

---
 hipamd/include/hip/hcc_detail/hip_atomic.h    | 33 ++++++++++++++-
 .../src/deviceLib/hipSimpleAtomicsTest.cpp    | 42 ++++++++++++++++---
 2 files changed, 68 insertions(+), 7 deletions(-)
diff --git a/hipamd/include/hip/hcc_detail/hip_atomic.h b/hipamd/include/hip/hcc_detail/hip_atomic.h
index 263f639e96..c8dcf02322 100644
--- a/hipamd/include/hip/hcc_detail/hip_atomic.h
+++ b/hipamd/include/hip/hcc_detail/hip_atomic.h
@@ -55,7 +55,7 @@ unsigned long long atomicAdd(
 }
 __device__
 inline
-float atomicAdd(float* address, float val)
+float atomicAdd_impl(float* address, float val)
 {
     unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
     unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
@@ -73,6 +73,37 @@ float atomicAdd(float* address, float val)
 
     return __uint_as_float(r);
 }
+#if !__has_builtin(__builtin_amdgcn_is_shared)
+    __device__
+    inline
+    bool __builtin_amdgcn_is_shared(
+        const __attribute__((address_space(0))) void* ptr) noexcept
+    {
+        #if defined(__HIP_DEVICE_COMPILE__)
+            const unsigned int gp = reinterpret_cast<unsigned long long>(ptr);
+
+            return gp ==
+                (__builtin_amdgcn_s_getreg((15 << 11) | (16 << 6) | 15) << 16);
+        #else
+            return false;
+        #endif
+    }
+#endif
+__device__
+inline
+float atomicAdd(float* address, float val)
+{
+    using GP = const __attribute__((address_space(0))) void*;
+    using LP = __attribute__((address_space(3))) float*;
+
+    #if __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+        if (__builtin_amdgcn_is_shared((GP) address)) {
+            return __builtin_amdgcn_ds_faddf((LP) address, val, 0, 0, false);
+        }
+    #endif
+
+    return atomicAdd_impl(address, val);
+}
 __device__
 inline
 double atomicAdd(double* address, double val)
diff --git a/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
index 760d65e555..5a09799c0e 100644
--- a/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
+++ b/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
@@ -29,6 +29,7 @@ THE SOFTWARE.
 
 // includes, system
 #include <algorithm>
+#include <cstring>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -212,7 +213,7 @@ __device__
 void testKernelSub(...) {}
 
 template<
-    typename T, 
+    typename T,
     typename enable_if<
         is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
 __device__
@@ -259,13 +260,37 @@ void testKernelIntegral(T* g_odata) {
     testKernelSub(g_odata);
 }
 
+namespace {
+    constexpr unsigned int numData = 11;
+}
+
 template<typename T>
-__global__ void testKernel(T* g_odata) {
+__global__ void testKernel(T* g_odata, T* g_sdata) {
     // Atomic addition
     atomicAdd(&g_odata[0], 10);
 
     testKernelIntegral(g_odata);
     testKernelExch(g_odata);
+
+    #if !defined(HIP_PLATFORM_NVCC)
+        // Shared Atomic addition.
+        __shared__ T s_odata[numData];
+
+        if (threadIdx.x == 0) { s_odata[8] = s_odata[10] = 0xff; }
+
+        __syncthreads();
+
+        atomicAdd(&s_odata[0], 10);
+
+        testKernelIntegral(s_odata);
+        testKernelExch(s_odata);
+
+        __syncthreads();
+
+        if (threadIdx.x == 0) {
+            __builtin_memcpy(g_sdata, s_odata, sizeof(T) * numData);
+        }
+    #endif
 }
 
 template<typename T>
@@ -286,33 +311,38 @@ void runTest() {
 
     unsigned int numThreads = 256;
     unsigned int numBlocks = 64;
-    unsigned int numData = 11;
     unsigned int memSize = sizeof(T) * numData;
 
     // allocate mem for the result on host side
     T* hOData = (T*)malloc(memSize);
+    T* hSData = (T*)malloc(memSize);
 
     // initialize the memory
-    for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
+    std::memset(hOData, 0, memSize);
+    std::memset(hSData, 0, memSize);
 
     // To make the AND and XOR tests generate something other than 0...
     hOData[8] = hOData[10] = 0xff;
 
     // allocate device memory for result
     T* dOData;
+    T* dSData;
     hipMalloc((void**)&dOData, memSize);
+    hipMalloc((void**)&dSData, memSize);
     // copy host memory to device to initialize to zero
     hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);
 
     // execute the kernel
     hipLaunchKernelGGL(
-        testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
+        testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData, dSData);
 
     // Copy result from device to host
     hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
+    hipMemcpy(hSData, dSData, memSize, hipMemcpyDeviceToHost);
 
     // Compute reference solution
-    testResult = computeGold(hOData, numThreads * numBlocks);
+    testResult = std::equal(hOData, hOData + numData, hSData) &&
+                 computeGold(hOData, numThreads * numBlocks);
 
     // Cleanup memory
     free(hOData);