From d9ecde1b60eed53e687d305cd4e6e02bef3c85a7 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 17 Nov 2017 16:00:28 +0000 Subject: [PATCH 1/2] This fixes some outright quaint choices made when implementing HIP's bitwise conversion functions, by using simple reinterpret_casts, as is idiomatic. These functions are supposed to be re-entrant, correct and efficient. Sadly, they were neither: they hid a massive race condition against a value stored in global memory, which means that they were also unreasonably slow if they ever managed to be correct, and relied on union based type punning which is in a grey area of the standard. It is difficult to ascertain what may have been the reason for coming up with this quirky solution. [ROCm/hip commit: 153878e368d4e2966bd5c2524e1fd8c3e1b686ff] --- projects/hip/src/device_functions.cpp | 53 ++++++--------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/projects/hip/src/device_functions.cpp b/projects/hip/src/device_functions.cpp index 615ae4d0b7..a66cc1e9fb 100644 --- a/projects/hip/src/device_functions.cpp +++ b/projects/hip/src/device_functions.cpp @@ -23,27 +23,6 @@ THE SOFTWARE. #include #include "device_util.h" -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; @@ -64,13 +43,11 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - hold64.d = x; - return hold64.si[1]; + return reinterpret_cast(x)[1]; } __device__ int __double2loint(double x) { - hold64.d = x; - return hold64.si[0]; + return reinterpret_cast(x)[0]; } @@ -145,8 +122,7 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - hold64.d = x; - return hold64.sli; + return reinterpret_cast(x); } __device__ int __float2int_rd(float x) @@ -219,19 +195,17 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - hold32.f = x; - return hold32.si; + return reinterpret_cast(x); } __device__ unsigned int __float_as_uint(float x) { - hold32.f = x; - return hold32.ui; + return reinterpret_cast(x); } __device__ double __hiloint2double(int hi, int lo) -{ - hold64.si[1] = hi; - hold64.si[0] = lo; - return hold64.d; +{ // TODO: this matches the original in not considering endianness, is that + // correct though? + int tmp[] = {lo, hi}; + return reinterpret_cast(tmp); } __device__ double __int2double_rn(int x) { @@ -257,8 +231,7 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - hold32.si = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ll2double_rd(long long int x) @@ -297,8 +270,7 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - hold64.sli = x; - return hold64.d; + return reinterpret_cast(x); } __device__ double __uint2double_rn(int x) @@ -325,8 +297,7 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - hold32.ui = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ull2double_rd(unsigned long long int x) From 6de4a217c0725a0129913c37a05f4a79f850a70d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 18 Nov 2017 01:16:31 +0000 Subject: [PATCH 2/2] This actually (tries) to do the right thing all the way, by using memcpy for bitcasting, and not rely on undefined behaviour of a different flavour as a substitute for the original undefined behaviour. Note that the compiler will (should) optimise down to the same emitted code, since this is a pattern it understands. [ROCm/hip commit: 6fa7adf077870ccc40aaf606a941d6fc938207cf] --- projects/hip/src/device_functions.cpp | 71 ++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/projects/hip/src/device_functions.cpp b/projects/hip/src/device_functions.cpp index a66cc1e9fb..63425bc9f4 100644 --- a/projects/hip/src/device_functions.cpp +++ b/projects/hip/src/device_functions.cpp @@ -43,11 +43,21 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - return reinterpret_cast(x)[1]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[1]; } __device__ int __double2loint(double x) { - return reinterpret_cast(x)[0]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[0]; } @@ -122,7 +132,12 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - return reinterpret_cast(x); + static_assert(sizeof(long long) == sizeof(double), ""); + + long long tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ int __float2int_rd(float x) @@ -195,17 +210,32 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - return reinterpret_cast(x); + static_assert(sizeof(int) == sizeof(float), ""); + + int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ unsigned int __float_as_uint(float x) { - return reinterpret_cast(x); + static_assert(sizeof(unsigned int) == sizeof(float), ""); + + unsigned int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } -__device__ double __hiloint2double(int hi, int lo) -{ // TODO: this matches the original in not considering endianness, is that - // correct though? - int tmp[] = {lo, hi}; - return reinterpret_cast(tmp); +__device__ double __hiloint2double(int32_t hi, int32_t lo) +{ + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + uint64_t tmp0 = + (static_cast(hi) << 32ull) | static_cast(lo); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + + return tmp1; } __device__ double __int2double_rn(int x) { @@ -231,7 +261,12 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ll2double_rd(long long int x) @@ -270,7 +305,12 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - return reinterpret_cast(x); + static_assert(sizeof(double) == sizeof(long long), ""); + + double tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return x; } __device__ double __uint2double_rn(int x) @@ -297,7 +337,12 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(unsigned int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ull2double_rd(unsigned long long int x)