diff --git a/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h index 2d5f73c153..7b0a13e83a 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h @@ -297,9 +297,16 @@ float __shfl(float var, int src_lane, int width = warpSize) { __device__ inline double __shfl(double var, int src_lane, int width = warpSize) { - __u tmp; tmp.f = (float) var; - tmp.i = __shfl(tmp.i, src_lane, width); - return (double) tmp.f; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl(tmp[0], src_lane, width); + tmp[1] = __shfl(tmp[1], src_lane, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; } __device__ @@ -327,9 +334,16 @@ float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) { __device__ inline double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) { - __u tmp; tmp.f = (float) var; - tmp.i = __shfl_up(tmp.i, lane_delta, width); - return (double) tmp.f; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; } __device__ @@ -357,9 +371,16 @@ float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) { __device__ inline double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) { - __u tmp; tmp.f = (float) var; - tmp.i = __shfl_down(tmp.i, lane_delta, width); - return (double) tmp.f; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; } __device__ @@ -387,9 +408,16 @@ float __shfl_xor(float var, int lane_mask, int width = warpSize) { __device__ inline double __shfl_xor(double var, int lane_mask, int width = warpSize) { - __u tmp; tmp.f = (float) var; - tmp.i = __shfl_xor(tmp.i, lane_mask, width); - return (double) tmp.f; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; } #define MASK1 0x00ff00ff