From 4e564f783e48aae354820f02157d962c6f7ee307 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 29 Nov 2019 11:58:12 +0530 Subject: [PATCH] Revert changes for atomic FADD support when address is in LDS (#1701) This reverts PR #1591 and follow-on PR #1695 [ROCm/hip commit: 32442c65064f835f4acaa716ca246e100d69500b] --- .../hip/include/hip/hcc_detail/hip_atomic.h | 30 +------------ .../src/deviceLib/hipSimpleAtomicsTest.cpp | 42 +++---------------- 2 files changed, 7 insertions(+), 65 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_atomic.h b/projects/hip/include/hip/hcc_detail/hip_atomic.h index 699dbe7816..263f639e96 100644 --- a/projects/hip/include/hip/hcc_detail/hip_atomic.h +++ b/projects/hip/include/hip/hcc_detail/hip_atomic.h @@ -55,7 +55,7 @@ unsigned long long atomicAdd( } __device__ inline -float atomicAdd_impl(float* address, float val) +float atomicAdd(float* address, float val) { unsigned int* uaddr{reinterpret_cast(address)}; unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; @@ -75,34 +75,6 @@ float atomicAdd_impl(float* address, float val) } __device__ inline -bool __hip_is_shared(const __attribute__((address_space(0))) void* ptr) noexcept -{ // TODO: this is ersatz for __builtin_amdgcn_is_shared. - #if defined(__HIP_DEVICE_COMPILE__) - const unsigned int gp = reinterpret_cast(ptr); - - return - gp ==(__builtin_amdgcn_s_getreg((15 << 11) | (16 << 6) | 15) << 16); - #else - return false; - #endif -} -__device__ -inline -float atomicAdd(float* address, float val) -{ - using GP = const __attribute__((address_space(0))) void*; - using LP = __attribute__((address_space(3))) float*; - - #if __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ - if (__hip_is_shared((GP) address)) { - return __builtin_amdgcn_ds_faddf((LP) address, val, 0, 0, false); - } - #endif - - return atomicAdd_impl(address, val); -} -__device__ -inline double atomicAdd(double* address, double val) { unsigned long long* uaddr{reinterpret_cast(address)}; diff --git a/projects/hip/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/projects/hip/tests/src/deviceLib/hipSimpleAtomicsTest.cpp index 5a09799c0e..760d65e555 100644 --- a/projects/hip/tests/src/deviceLib/hipSimpleAtomicsTest.cpp +++ b/projects/hip/tests/src/deviceLib/hipSimpleAtomicsTest.cpp @@ -29,7 +29,6 @@ THE SOFTWARE. // includes, system #include -#include #include #include #include @@ -213,7 +212,7 @@ __device__ void testKernelSub(...) {} template< - typename T, + typename T, typename enable_if< is_same{} || is_same{}>::type* = nullptr> __device__ @@ -260,37 +259,13 @@ void testKernelIntegral(T* g_odata) { testKernelSub(g_odata); } -namespace { - constexpr unsigned int numData = 11; -} - template -__global__ void testKernel(T* g_odata, T* g_sdata) { +__global__ void testKernel(T* g_odata) { // Atomic addition atomicAdd(&g_odata[0], 10); testKernelIntegral(g_odata); testKernelExch(g_odata); - - #if !defined(HIP_PLATFORM_NVCC) - // Shared Atomic addition. - __shared__ T s_odata[numData]; - - if (threadIdx.x == 0) { s_odata[8] = s_odata[10] = 0xff; } - - __syncthreads(); - - atomicAdd(&s_odata[0], 10); - - testKernelIntegral(s_odata); - testKernelExch(s_odata); - - __syncthreads(); - - if (threadIdx.x == 0) { - __builtin_memcpy(g_sdata, s_odata, sizeof(T) * numData); - } - #endif } template @@ -311,38 +286,33 @@ void runTest() { unsigned int numThreads = 256; unsigned int numBlocks = 64; + unsigned int numData = 11; unsigned int memSize = sizeof(T) * numData; // allocate mem for the result on host side T* hOData = (T*)malloc(memSize); - T* hSData = (T*)malloc(memSize); // initialize the memory - std::memset(hOData, 0, memSize); - std::memset(hSData, 0, memSize); + for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; // To make the AND and XOR tests generate something other than 0... hOData[8] = hOData[10] = 0xff; // allocate device memory for result T* dOData; - T* dSData; hipMalloc((void**)&dOData, memSize); - hipMalloc((void**)&dSData, memSize); // copy host memory to device to initialize to zero hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice); // execute the kernel hipLaunchKernelGGL( - testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData, dSData); + testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); // Copy result from device to host hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost); - hipMemcpy(hSData, dSData, memSize, hipMemcpyDeviceToHost); // Compute reference solution - testResult = std::equal(hOData, hOData + numData, hSData) && - computeGold(hOData, numThreads * numBlocks); + testResult = computeGold(hOData, numThreads * numBlocks); // Cleanup memory free(hOData);