From d9ecde1b60eed53e687d305cd4e6e02bef3c85a7 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Fri, 17 Nov 2017 16:00:28 +0000
Subject: [PATCH 1/2] This fixes some outright quaint choices made when
 implementing HIP's bitwise conversion functions, by using simple
 reinterpret_casts, as is idiomatic. These functions are supposed to be
 re-entrant, correct and efficient. Sadly, they were neither: they hid a
 massive race condition against a value stored in global memory, which means
 that they were also unreasonably slow if they ever managed to be correct, and
 relied on union based type punning which is in a grey area of the standard.
 It is difficult to ascertain what may have been the reason for coming up with
 this quirky solution.

[ROCm/hip commit: 153878e368d4e2966bd5c2524e1fd8c3e1b686ff]
---
 projects/hip/src/device_functions.cpp | 53 ++++++---------------------
 1 file changed, 12 insertions(+), 41 deletions(-)
diff --git a/projects/hip/src/device_functions.cpp b/projects/hip/src/device_functions.cpp
index 615ae4d0b7..a66cc1e9fb 100644
--- a/projects/hip/src/device_functions.cpp
+++ b/projects/hip/src/device_functions.cpp
@@ -23,27 +23,6 @@ THE SOFTWARE.
 #include <hc_math.hpp>
 #include "device_util.h"
 
-struct holder64Bit{
-  union{
-    double d;
-    unsigned long int uli;
-    signed long int sli;
-    signed int si[2];
-    unsigned int ui[2];
-  };
-} __attribute__((aligned(8)));
-
-struct holder32Bit {
-  union {
-    float f;
-    unsigned int ui;
-    signed int si;
-  };
-} __attribute__((aligned(4)));
-
-__device__ struct holder64Bit hold64;
-__device__ struct holder32Bit hold32;
-
 __device__ float __double2float_rd(double x)
 {
   return (double)x;
@@ -64,13 +43,11 @@ __device__ float __double2float_rz(double x)
 
 __device__ int __double2hiint(double x)
 {
-  hold64.d = x;
-  return hold64.si[1];
+  return reinterpret_cast<int(&)[2]>(x)[1];
 }
 __device__ int __double2loint(double x)
 {
-  hold64.d = x;
-  return hold64.si[0];
+  return reinterpret_cast<int(&)[2]>(x)[0];
 }
 
 
@@ -145,8 +122,7 @@ __device__ unsigned long long int __double2ull_rz(double x)
 
 __device__ long long int __double_as_longlong(double x)
 {
-  hold64.d = x;
-  return hold64.sli;
+  return reinterpret_cast<long long&>(x);
 }
 
 __device__ int __float2int_rd(float x)
@@ -219,19 +195,17 @@ __device__ unsigned long long int __float2ull_rz(float x)
 
 __device__ int __float_as_int(float x)
 {
-  hold32.f = x;
-  return hold32.si;
+  return reinterpret_cast<int&>(x);
 }
 __device__ unsigned int __float_as_uint(float x)
 {
-  hold32.f = x;
-  return hold32.ui;
+  return reinterpret_cast<unsigned int&>(x);
 }
 __device__ double __hiloint2double(int hi, int lo)
-{
-  hold64.si[1] = hi;
-  hold64.si[0] = lo;
-  return hold64.d;
+{ // TODO: this matches the original in not considering endianness, is that
+  //       correct though?
+  int tmp[] = {lo, hi};
+  return reinterpret_cast<double&>(tmp);
 }
 __device__ double __int2double_rn(int x)
 {
@@ -257,8 +231,7 @@ __device__ float __int2float_rz(int x)
 
 __device__ float __int_as_float(int x)
 {
-  hold32.si = x;
-  return hold32.f;
+  return reinterpret_cast<float&>(x);
 }
 
 __device__ double __ll2double_rd(long long int x)
@@ -297,8 +270,7 @@ __device__ float __ll2float_rz(long long int x)
 
 __device__ double __longlong_as_double(long long int x)
 {
-  hold64.sli = x;
-  return hold64.d;
+  return reinterpret_cast<double&>(x);
 }
 
 __device__ double __uint2double_rn(int x)
@@ -325,8 +297,7 @@ __device__ float __uint2float_rz(unsigned int x)
 
 __device__ float __uint_as_float(unsigned int x)
 {
-  hold32.ui = x;
-  return hold32.f;
+  return reinterpret_cast<float&>(x);
 }
 
 __device__ double __ull2double_rd(unsigned long long int x)

From 6de4a217c0725a0129913c37a05f4a79f850a70d Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Sat, 18 Nov 2017 01:16:31 +0000
Subject: [PATCH 2/2] This actually (tries) to do the right thing all the way,
 by using memcpy for bitcasting, and not rely on undefined behaviour of a
 different flavour as a substitute for the original undefined behaviour. Note
 that the compiler will (should) optimise down to the same emitted code, since
 this is a pattern it understands.

[ROCm/hip commit: 6fa7adf077870ccc40aaf606a941d6fc938207cf]
---
 projects/hip/src/device_functions.cpp | 71 ++++++++++++++++++++++-----
 1 file changed, 58 insertions(+), 13 deletions(-)

diff --git a/projects/hip/src/device_functions.cpp b/projects/hip/src/device_functions.cpp
index a66cc1e9fb..63425bc9f4 100644
--- a/projects/hip/src/device_functions.cpp
+++ b/projects/hip/src/device_functions.cpp
@@ -43,11 +43,21 @@ __device__ float __double2float_rz(double x)
 
 __device__ int __double2hiint(double x)
 {
-  return reinterpret_cast<int(&)[2]>(x)[1];
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &x, sizeof(tmp));
+
+  return tmp[1];
 }
 __device__ int __double2loint(double x)
 {
-  return reinterpret_cast<int(&)[2]>(x)[0];
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &x, sizeof(tmp));
+
+  return tmp[0];
 }
 
 
@@ -122,7 +132,12 @@ __device__ unsigned long long int __double2ull_rz(double x)
 
 __device__ long long int __double_as_longlong(double x)
 {
-  return reinterpret_cast<long long&>(x);
+  static_assert(sizeof(long long) == sizeof(double), "");
+
+  long long tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
 }
 
 __device__ int __float2int_rd(float x)
@@ -195,17 +210,32 @@ __device__ unsigned long long int __float2ull_rz(float x)
 
 __device__ int __float_as_int(float x)
 {
-  return reinterpret_cast<int&>(x);
+  static_assert(sizeof(int) == sizeof(float), "");
+
+  int tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
 }
 __device__ unsigned int __float_as_uint(float x)
 {
-  return reinterpret_cast<unsigned int&>(x);
+  static_assert(sizeof(unsigned int) == sizeof(float), "");
+
+  unsigned int tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
 }
-__device__ double __hiloint2double(int hi, int lo)
-{ // TODO: this matches the original in not considering endianness, is that
-  //       correct though?
-  int tmp[] = {lo, hi};
-  return reinterpret_cast<double&>(tmp);
+__device__ double __hiloint2double(int32_t hi, int32_t lo)
+{
+  static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+  uint64_t tmp0 =
+    (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+
+  return tmp1;
 }
 __device__ double __int2double_rn(int x)
 {
@@ -231,7 +261,12 @@ __device__ float __int2float_rz(int x)
 
 __device__ float __int_as_float(int x)
 {
-  return reinterpret_cast<float&>(x);
+  static_assert(sizeof(float) == sizeof(int), "");
+
+  float tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
 }
 
 __device__ double __ll2double_rd(long long int x)
@@ -270,7 +305,12 @@ __device__ float __ll2float_rz(long long int x)
 
 __device__ double __longlong_as_double(long long int x)
 {
-  return reinterpret_cast<double&>(x);
+  static_assert(sizeof(double) == sizeof(long long), "");
+
+  double tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return x;
 }
 
 __device__ double __uint2double_rn(int x)
@@ -297,7 +337,12 @@ __device__ float __uint2float_rz(unsigned int x)
 
 __device__ float __uint_as_float(unsigned int x)
 {
-  return reinterpret_cast<float&>(x);
+  static_assert(sizeof(float) == sizeof(unsigned int), "");
+
+  float tmp;
+  __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+  return tmp;
 }
 
 __device__ double __ull2double_rd(unsigned long long int x)