diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt
index 14dba84e46..5f651d6dd7 100755
--- a/projects/clr/hipamd/CMakeLists.txt
+++ b/projects/clr/hipamd/CMakeLists.txt
@@ -47,7 +47,6 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 #############################
 option(BUILD_HIPIFY_CLANG "Enable building the CUDA->HIP converter" OFF)
 option(__HIP_ENABLE_PCH "Enable/Disable pre-compiled hip headers" ON)
-option(__HIP_USE_CMPXCHG_FOR_FP_ATOMICS "Enable/Disable FP Atomics version" OFF)
 option(HIP_OFFICIAL_BUILD "Enable/Disable for mainline/staging builds" OFF)
 option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" ON)
 set(HIPCC_BIN_DIR "" CACHE STRING "HIPCC and HIPCONFIG binary directories")
@@ -58,10 +57,6 @@ else()
   set(_pchStatus 0)
 endif()
 
-if(__HIP_USE_CMPXCHG_FOR_FP_ATOMICS)
-  add_definitions(-D__HIP_USE_CMPXCHG_FOR_FP_ATOMICS)
-endif()
-
 message(STATUS "HIPCC_BIN_DIR found at ${HIPCC_BIN_DIR}")
 message(STATUS "HIP_COMMON_DIR found at ${HIP_COMMON_DIR}")
 set(HIP_COMMON_INCLUDE_DIR ${HIP_COMMON_DIR}/include)
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h
index 1a9a71338f..80e0b3f2a8 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_atomic.h
@@ -186,7 +186,11 @@ unsigned long long atomicAdd_system(unsigned long long* address, unsigned long l
 __device__
 inline
 float atomicAdd(float* address, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicAdd(address, val);
+#else
   return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#endif
 }
 
 __device__
@@ -208,7 +212,11 @@ void atomicAddNoRet(float* address, float val)
 __device__
 inline
 double atomicAdd(double* address, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicAdd(address, val);
+#else
   return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#endif
 }
 
 __device__
@@ -268,7 +276,11 @@ unsigned long long atomicSub_system(unsigned long long* address, unsigned long l
 __device__
 inline
 float atomicSub(float* address, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicAdd(address, -val);
+#else
   return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#endif
 }
 
 __device__
@@ -280,7 +292,11 @@ float atomicSub_system(float* address, float val) {
 __device__
 inline
 double atomicSub(double* address, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicAdd(address, -val);
+#else
   return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#endif
 }
 
 __device__
@@ -836,24 +852,10 @@ __device__
 inline
 float atomicAdd(float* address, float val)
 {
-#ifndef __HIP_USE_CMPXCHG_FOR_FP_ATOMICS
-    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+    return unsafeAtomicAdd(address, val);
 #else
-    unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
-    unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
-
-    unsigned int old;
-    do {
-        old = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
-
-        if (r != old) { r = old; continue; }
-
-        r = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r)));
-
-        if (r == old) break;
-    } while (true);
-
-    return __uint_as_float(r);
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
 #endif
 }
 
@@ -871,25 +873,10 @@ __device__
 inline
 double atomicAdd(double* address, double val)
 {
-#ifndef __HIP_USE_CMPXCHG_FOR_FP_ATOMICS
-    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+    return unsafeAtomicAdd(address, val);
 #else
-    unsigned long long* uaddr{reinterpret_cast<unsigned long long*>(address)};
-    unsigned long long r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
-
-    unsigned long long old;
-    do {
-        old = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
-
-        if (r != old) { r = old; continue; }
-
-        r = atomicCAS(
-            uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
-
-        if (r == old) break;
-    } while (true);
-
-    return __longlong_as_double(r);
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
 #endif
 }
 
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h
index bf36a3f6ae..ba5795897b 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h
@@ -24,13 +24,14 @@ THE SOFTWARE.
 
 #ifdef __cplusplus
 /**
- * @brief Unsafe floating point rmw atomic add for gfx90a.
+ * @brief Unsafe floating point rmw atomic add.
  *
  * Performs a relaxed read-modify-write floating point atomic add with
  * device memory scope. Original value at \p addr is returned and
  * the value of \p addr is updated to have the original value plus \p value
  *
- * @note This operation is only suppored for the gfx90a target.
+ * @note This operation currently only performs different operations for
+ * the gfx90a target. Other devices continue to use safe atomics.
  *
  * It can be used to generate code that uses fast hardware floating point atomic
  * operations which may handle rounding and subnormal values differently than
@@ -40,35 +41,47 @@ THE SOFTWARE.
  * following condition are met:
  *
  * - \p addr is at least 4 bytes aligned
- * - \p addr is a global segment address in a coarse grain allocation.
- * Global segment addresses in fine grain allocations, group segment addresses,
- * and private segment addresses (used for function argument and function local
- * variables) are not supported.
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and is not supported.
  *
  * @param [in,out] addr Pointer to value to be increment by \p value.
  * @param [in] value Value by \p addr is to be incremented.
  * @return Original value contained in \p addr.
  */
-#if __has_builtin(__builtin_amdgcn_is_shared) &&                               \
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+#if defined(__gfx90a__) &&                                                     \
+    __has_builtin(__builtin_amdgcn_is_shared) &&                               \
+    __has_builtin(__builtin_amdgcn_is_private) &&                              \
     __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) &&                      \
     __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
-__device__ inline float unsafeAtomicAdd(float* addr, float value) {
   if (__builtin_amdgcn_is_shared(
-          (const __attribute__((address_space(0))) void*)addr))
+        (const __attribute__((address_space(0))) void*)addr))
     return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
+  else if (__builtin_amdgcn_is_private(
+              (const __attribute__((address_space(0))) void*)addr)) {
+    float temp = *addr;
+    *addr = temp + value;
+    return temp;
+  }
   else
     return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
-}
+#elif __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
 #endif
+}
 
 /**
- * @brief Unsafe double precision rmw atomic add for gfx90a.
+ * @brief Unsafe double precision rmw atomic add.
  *
  * Performs a relaxed read-modify-write double precision atomic add with
  * device memory scope. Original value at \p addr is returned and
  * the value of \p addr is updated to have the original value plus \p value
  *
- * @note This operation is only suppored for the gfx90a target.
+ * @note This operation currently only performs different operations for
+ * the gfx90a target. Other devices continue to use safe atomics.
  *
  * It can be used to generate code that uses fast hardware floating point atomic
  * operations which may handle rounding and subnormal values differently than
@@ -78,24 +91,128 @@ __device__ inline float unsafeAtomicAdd(float* addr, float value) {
  * following condition are met:
  *
  * - \p addr is at least 8 byte aligned
- * - \p addr is a global segment address in a coarse grain allocation.
- * Global segment addresses in fine grain allocations, group segment addresses,
- * and private segment addresses (used for function argument and function local
- * variables) are not supported.
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and are not supported.
  *
  * @param [in,out] addr Pointer to value to be increment by \p value.
  * @param [in] value Value by \p addr is to be incremented.
  * @return Original value contained in \p addr.
  */
-#if __has_builtin(__builtin_amdgcn_is_shared) &&                               \
-    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f64) &&                      \
-    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
 __device__ inline double unsafeAtomicAdd(double* addr, double value) {
-  if (__builtin_amdgcn_is_shared(
-          (const __attribute__((address_space(0))) void*)addr))
-    return __builtin_amdgcn_ds_atomic_fadd_f64(addr, value);
-  else
-    return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
+#if defined(__gfx90a__) &&                                                     \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
+  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
+#elif defined (__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be increment by \p value.
+ * @param [in] value Value by \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+#if defined(__gfx908__) ||                                                    \
+    (defined(__gfx90a) && !__has_builtin(__hip_atomic_fetch_add))
+  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
+  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
+  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
+  // force a CAS loop here.
+  float old_val;
+#if __has_builtin(__hip_atomic_load)
+  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_load)
+  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
+#endif // __has_builtin(__hip_atomic_load)
+  float expected, temp;
+  do {
+    temp = expected = old_val;
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
+                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
+    old_val = expected;
+  } while (__float_as_uint(temp) != __float_as_uint(old_val));
+  return old_val;
+#elif defined(__gfx90a__)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be increment by \p value.
+ * @param [in] value Value by \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if defined(__gfx90a__) &&                                                    \
+    __has_builtin(__hip_atomic_fetch_add)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#elif defined(__gfx90a__)
+  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
+  // force a CAS loop here.
+  double old_val;
+#if __has_builtin(__hip_atomic_load)
+  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_load)
+  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
+#endif // __has_builtin(__hip_atomic_load)
+  double expected, temp;
+  do {
+    temp = expected = old_val;
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
+                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
+    old_val = expected;
+  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
+  return old_val;
+#else // !defined(__gfx90a__)
+#if __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_fetch_add)
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif // __has_builtin(__hip_atomic_fetch_add)
+#endif
 }
 #endif
-#endif
diff --git a/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h
index 007fc70085..b1002b71dd 100644
--- a/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h
+++ b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <cuda_runtime.h>
 
 #include <hip/hip_runtime_api.h>
+#include "nvidia_hip_unsafe_atomics.h"
 
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 
diff --git a/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
new file mode 100644
index 0000000000..919353129a
--- /dev/null
+++ b/projects/clr/hipamd/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
+
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+    return atomicAdd(addr, value);
+}
+
+__device__ inline double unsafeAtomicAdd(double* addr, double value) {
+#if __CUDA_ARCH__ < 600
+    unsigned long long *addr_cast = (unsigned long long*)addr;
+    unsigned long long old_val = *addr_cast;
+    unsigned long long expected;
+    do {
+        expected = old_val;
+        old_val = atomicCAS(addr_cast, expected,
+                            __double_as_longlong(value +
+                                                 __longlong_as_double(expected)));
+    } while (__double_as_longlong(expected) != __double_as_longlong(old_val));
+    return old_val;
+#else
+    return atomicAdd(addr, value);
+#endif
+}
+
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+    return atomicAdd(addr, value);
+}
+
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if __CUDA_ARCH__ < 600
+    unsigned long long *addr_cast = (unsigned long long*)addr;
+    unsigned long long old_val = *addr_cast;
+    unsigned long long expected;
+    do {
+        expected = old_val;
+        old_val = atomicCAS(addr_cast, expected,
+                            __double_as_longlong(value +
+                                                 __longlong_as_double(expected)));
+    } while (__double_as_longlong(expected) != __double_as_longlong(old_val));
+    return old_val;
+#else
+    return atomicAdd(addr, value);
+#endif
+}
+
+#endif