diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index 14dba84e46..5f651d6dd7 100755 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -47,7 +47,6 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ############################# option(BUILD_HIPIFY_CLANG "Enable building the CUDA->HIP converter" OFF) option(__HIP_ENABLE_PCH "Enable/Disable pre-compiled hip headers" ON) -option(__HIP_USE_CMPXCHG_FOR_FP_ATOMICS "Enable/Disable FP Atomics version" OFF) option(HIP_OFFICIAL_BUILD "Enable/Disable for mainline/staging builds" OFF) option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" ON) set(HIPCC_BIN_DIR "" CACHE STRING "HIPCC and HIPCONFIG binary directories") @@ -58,10 +57,6 @@ else() set(_pchStatus 0) endif() -if(__HIP_USE_CMPXCHG_FOR_FP_ATOMICS) - add_definitions(-D__HIP_USE_CMPXCHG_FOR_FP_ATOMICS) -endif() - message(STATUS "HIPCC_BIN_DIR found at ${HIPCC_BIN_DIR}") message(STATUS "HIP_COMMON_DIR found at ${HIP_COMMON_DIR}") set(HIP_COMMON_INCLUDE_DIR ${HIP_COMMON_DIR}/include) diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h index 1a9a71338f..80e0b3f2a8 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h @@ -186,7 +186,11 @@ unsigned long long atomicAdd_system(unsigned long long* address, unsigned long l __device__ inline float atomicAdd(float* address, float val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); +#else return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif } __device__ @@ -208,7 +212,11 @@ void atomicAddNoRet(float* address, float val) __device__ inline double atomicAdd(double* address, double val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); +#else return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif } __device__ @@ -268,7 +276,11 @@ unsigned long long atomicSub_system(unsigned long long* address, unsigned long l __device__ inline float atomicSub(float* address, float val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, -val); +#else return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif } __device__ @@ -280,7 +292,11 @@ float atomicSub_system(float* address, float val) { __device__ inline double atomicSub(double* address, double val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, -val); +#else return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif } __device__ @@ -836,24 +852,10 @@ __device__ inline float atomicAdd(float* address, float val) { -#ifndef __HIP_USE_CMPXCHG_FOR_FP_ATOMICS - return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); #else - unsigned int* uaddr{reinterpret_cast(address)}; - unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - - unsigned int old; - do { - old = __atomic_load_n(uaddr, __ATOMIC_RELAXED); - - if (r != old) { r = old; continue; } - - r = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r))); - - if (r == old) break; - } while (true); - - return __uint_as_float(r); + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); #endif } @@ -871,25 +873,10 @@ __device__ inline double atomicAdd(double* address, double val) { -#ifndef __HIP_USE_CMPXCHG_FOR_FP_ATOMICS - return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); #else - unsigned long long* uaddr{reinterpret_cast(address)}; - unsigned long long r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - - unsigned long long old; - do { - old = __atomic_load_n(uaddr, __ATOMIC_RELAXED); - - if (r != old) { r = old; continue; } - - r = atomicCAS( - uaddr, r, __double_as_longlong(val + __longlong_as_double(r))); - - if (r == old) break; - } while (true); - - return __longlong_as_double(r); + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); #endif } diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h index bf36a3f6ae..ba5795897b 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -24,13 +24,14 @@ THE SOFTWARE. #ifdef __cplusplus /** - * @brief Unsafe floating point rmw atomic add for gfx90a. + * @brief Unsafe floating point rmw atomic add. * * Performs a relaxed read-modify-write floating point atomic add with * device memory scope. Original value at \p addr is returned and * the value of \p addr is updated to have the original value plus \p value * - * @note This operation is only suppored for the gfx90a target. + * @note This operation currently only performs different operations for + * the gfx90a target. Other devices continue to use safe atomics. * * It can be used to generate code that uses fast hardware floating point atomic * operations which may handle rounding and subnormal values differently than @@ -40,35 +41,47 @@ THE SOFTWARE. * following condition are met: * * - \p addr is at least 4 bytes aligned - * - \p addr is a global segment address in a coarse grain allocation. - * Global segment addresses in fine grain allocations, group segment addresses, - * and private segment addresses (used for function argument and function local - * variables) are not supported. + * - If \p addr is a global segment address, it is in a coarse grain allocation. + * Passing in global segment addresses in fine grain allocations will result in + * undefined behavior and is not supported. * * @param [in,out] addr Pointer to value to be increment by \p value. * @param [in] value Value by \p addr is to be incremented. * @return Original value contained in \p addr. */ -#if __has_builtin(__builtin_amdgcn_is_shared) && \ +__device__ inline float unsafeAtomicAdd(float* addr, float value) { +#if defined(__gfx90a__) && \ + __has_builtin(__builtin_amdgcn_is_shared) && \ + __has_builtin(__builtin_amdgcn_is_private) && \ __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \ __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) -__device__ inline float unsafeAtomicAdd(float* addr, float value) { if (__builtin_amdgcn_is_shared( - (const __attribute__((address_space(0))) void*)addr)) + (const __attribute__((address_space(0))) void*)addr)) return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value); + else if (__builtin_amdgcn_is_private( + (const __attribute__((address_space(0))) void*)addr)) { + float temp = *addr; + *addr = temp + value; + return temp; + } else return __builtin_amdgcn_global_atomic_fadd_f32(addr, value); -} +#elif __has_builtin(__hip_atomic_fetch_add) + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED); #endif +} /** - * @brief Unsafe double precision rmw atomic add for gfx90a. + * @brief Unsafe double precision rmw atomic add. * * Performs a relaxed read-modify-write double precision atomic add with * device memory scope. Original value at \p addr is returned and * the value of \p addr is updated to have the original value plus \p value * - * @note This operation is only suppored for the gfx90a target. + * @note This operation currently only performs different operations for + * the gfx90a target. Other devices continue to use safe atomics. * * It can be used to generate code that uses fast hardware floating point atomic * operations which may handle rounding and subnormal values differently than @@ -78,24 +91,128 @@ __device__ inline float unsafeAtomicAdd(float* addr, float value) { * following condition are met: * * - \p addr is at least 8 byte aligned - * - \p addr is a global segment address in a coarse grain allocation. - * Global segment addresses in fine grain allocations, group segment addresses, - * and private segment addresses (used for function argument and function local - * variables) are not supported. + * - If \p addr is a global segment address, it is in a coarse grain allocation. + * Passing in global segment addresses in fine grain allocations will result in + * undefined behavior and are not supported. * * @param [in,out] addr Pointer to value to be increment by \p value. * @param [in] value Value by \p addr is to be incremented. * @return Original value contained in \p addr. */ -#if __has_builtin(__builtin_amdgcn_is_shared) && \ - __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f64) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64) __device__ inline double unsafeAtomicAdd(double* addr, double value) { - if (__builtin_amdgcn_is_shared( - (const __attribute__((address_space(0))) void*)addr)) - return __builtin_amdgcn_ds_atomic_fadd_f64(addr, value); - else - return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value); +#if defined(__gfx90a__) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64) + return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value); +#elif defined (__hip_atomic_fetch_add) + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED); +#endif +} + +/** + * @brief Safe floating point rmw atomic add. + * + * Performs a relaxed read-modify-write floating point atomic add with + * device memory scope. Original value at \p addr is returned and + * the value of \p addr is updated to have the original value plus \p value + * + * @note This operation ensures that, on all targets, we produce safe atomics. + * This will be the case even when -munsafe-fp-atomics is passed into the compiler. + * + * @param [in,out] addr Pointer to value to be increment by \p value. + * @param [in] value Value by \p addr is to be incremented. + * @return Original value contained in \p addr. + */ +__device__ inline float safeAtomicAdd(float* addr, float value) { +#if defined(__gfx908__) || \ + (defined(__gfx90a) && !__has_builtin(__hip_atomic_fetch_add)) + // On gfx908, we can generate unsafe FP32 atomic add that does not follow all + // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead. + // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to + // force a CAS loop here. + float old_val; +#if __has_builtin(__hip_atomic_load) + old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else // !__has_builtin(__hip_atomic_load) + old_val = __uint_as_float(__atomic_load_n(reinterpret_cast(addr), __ATOMIC_RELAXED)); +#endif // __has_builtin(__hip_atomic_load) + float expected, temp; + do { + temp = expected = old_val; +#if __has_builtin(__hip_atomic_compare_exchange_strong) + __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED, + __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else // !__has_builtin(__hip_atomic_compare_exchange_strong) + __atomic_compare_exchange_n(addr, &expected, old_val + value, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); +#endif // __has_builtin(__hip_atomic_compare_exchange_strong) + old_val = expected; + } while (__float_as_uint(temp) != __float_as_uint(old_val)); + return old_val; +#elif defined(__gfx90a__) + // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope + // atomics will produce safe CAS loops, but are otherwise not different than + // agent-scope atomics. This logic is only applicable for gfx90a, and should + // not be assumed on other architectures. + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#elif __has_builtin(__hip_atomic_fetch_add) + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED); +#endif +} + +/** + * @brief Safe double precision rmw atomic add. + * + * Performs a relaxed read-modify-write double precision atomic add with + * device memory scope. Original value at \p addr is returned and + * the value of \p addr is updated to have the original value plus \p value + * + * @note This operation ensures that, on all targets, we produce safe atomics. + * This will be the case even when -munsafe-fp-atomics is passed into the compiler. + * + * @param [in,out] addr Pointer to value to be increment by \p value. + * @param [in] value Value by \p addr is to be incremented. + * @return Original value contained in \p addr. + */ +__device__ inline double safeAtomicAdd(double* addr, double value) { +#if defined(__gfx90a__) && \ + __has_builtin(__hip_atomic_fetch_add) + // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope + // atomics will produce safe CAS loops, but are otherwise not different than + // agent-scope atomics. This logic is only applicable for gfx90a, and should + // not be assumed on other architectures. + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#elif defined(__gfx90a__) + // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to + // force a CAS loop here. + double old_val; +#if __has_builtin(__hip_atomic_load) + old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else // !__has_builtin(__hip_atomic_load) + old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast(addr), __ATOMIC_RELAXED)); +#endif // __has_builtin(__hip_atomic_load) + double expected, temp; + do { + temp = expected = old_val; +#if __has_builtin(__hip_atomic_compare_exchange_strong) + __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED, + __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else // !__has_builtin(__hip_atomic_compare_exchange_strong) + __atomic_compare_exchange_n(addr, &expected, old_val + value, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); +#endif // __has_builtin(__hip_atomic_compare_exchange_strong) + old_val = expected; + } while (__double_as_longlong(temp) != __double_as_longlong(old_val)); + return old_val; +#else // !defined(__gfx90a__) +#if __has_builtin(__hip_atomic_fetch_add) + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else // !__has_builtin(__hip_atomic_fetch_add) + return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED); +#endif // __has_builtin(__hip_atomic_fetch_add) +#endif } #endif -#endif diff --git a/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h index 007fc70085..b1002b71dd 100644 --- a/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h +++ b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include +#include "nvidia_hip_unsafe_atomics.h" #define HIP_KERNEL_NAME(...) __VA_ARGS__ diff --git a/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h new file mode 100644 index 0000000000..919353129a --- /dev/null +++ b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h @@ -0,0 +1,68 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H +#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H + +__device__ inline float unsafeAtomicAdd(float* addr, float value) { + return atomicAdd(addr, value); +} + +__device__ inline double unsafeAtomicAdd(double* addr, double value) { +#if __CUDA_ARCH__ < 600 + unsigned long long *addr_cast = (unsigned long long*)addr; + unsigned long long old_val = *addr_cast; + unsigned long long expected; + do { + expected = old_val; + old_val = atomicCAS(addr_cast, expected, + __double_as_longlong(value + + __longlong_as_double(expected))); + } while (__double_as_longlong(expected) != __double_as_longlong(old_val)); + return old_val; +#else + return atomicAdd(addr, value); +#endif +} + +__device__ inline float safeAtomicAdd(float* addr, float value) { + return atomicAdd(addr, value); +} + +__device__ inline double safeAtomicAdd(double* addr, double value) { +#if __CUDA_ARCH__ < 600 + unsigned long long *addr_cast = (unsigned long long*)addr; + unsigned long long old_val = *addr_cast; + unsigned long long expected; + do { + expected = old_val; + old_val = atomicCAS(addr_cast, expected, + __double_as_longlong(value + + __longlong_as_double(expected))); + } while (__double_as_longlong(expected) != __double_as_longlong(old_val)); + return old_val; +#else + return atomicAdd(addr, value); +#endif +} + +#endif