Merge pull request #723 from mangupta/fix_double_shfl

Fix logic for double variants of __shfl* [ROCm/clr commit: 05beeec84d]
2018-10-25 06:01:38 +05:30
@@ -297,9 +297,16 @@ float __shfl(float var, int src_lane, int width = warpSize) {
 __device__
 inline
 double __shfl(double var, int src_lane, int width = warpSize) {
-    __u tmp; tmp.f = (float) var;
-    tmp.i = __shfl(tmp.i, src_lane, width);
-    return (double) tmp.f;
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
 }

 __device__
@@ -327,9 +334,16 @@ float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
 __device__
 inline
 double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
-    __u tmp; tmp.f = (float) var;
-    tmp.i = __shfl_up(tmp.i, lane_delta, width);
-    return (double) tmp.f;
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
 }

 __device__
@@ -357,9 +371,16 @@ float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
 __device__
 inline
 double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
-    __u tmp; tmp.f = (float) var;
-    tmp.i = __shfl_down(tmp.i, lane_delta, width);
-    return (double) tmp.f;
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
 }

 __device__
@@ -387,9 +408,16 @@ float __shfl_xor(float var, int lane_mask, int width = warpSize) {
 __device__
 inline
 double __shfl_xor(double var, int lane_mask, int width = warpSize) {
-    __u tmp; tmp.f = (float) var;
-    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
-    return (double) tmp.f;
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
 }

 #define MASK1 0x00ff00ff