Merge pull request #723 from mangupta/fix_double_shfl

Fix logic for double variants of __shfl*

[ROCm/clr commit: 05beeec84d]
Этот коммит содержится в:
Maneesh Gupta
2018-10-25 06:01:38 +05:30
коммит произвёл GitHub
родитель 42a8d09f51 b6f6734285
Коммит bd0e1c87b7
+40 -12
Просмотреть файл
@@ -297,9 +297,16 @@ float __shfl(float var, int src_lane, int width = warpSize) {
/**
 * Double-precision overload of __shfl.
 *
 * The value is exchanged bit-exactly as two int-sized words, each moved with
 * the int overload of __shfl. It is never converted to float, which would
 * discard precision and range.
 *
 * @param var      value contributed by the calling lane.
 * @param src_lane forwarded unchanged to the int overload of __shfl.
 * @param width    forwarded unchanged to the int overload of __shfl
 *                 (defaults to warpSize).
 * @return the double reassembled from the two shuffled words.
 */
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    // The split below relies on a double occupying exactly two ints.
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    // Copy the object representation into two words (no aliasing issues).
    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    // Both halves use the same src_lane/width, so they come from the same lane.
    words[0] = __shfl(words[0], src_lane, width);
    words[1] = __shfl(words[1], src_lane, width);

    // Copying the words back in the same order is the exact inverse of the
    // split above, so the original bit pattern is reproduced.
    double result;
    __builtin_memcpy(&result, words, sizeof(result));
    return result;
}
__device__
@@ -327,9 +334,16 @@ float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
/**
 * Double-precision overload of __shfl_up.
 *
 * The value is exchanged bit-exactly as two int-sized words, each moved with
 * the int overload of __shfl_up. It is never converted to float, which would
 * discard precision and range.
 *
 * @param var        value contributed by the calling lane.
 * @param lane_delta forwarded unchanged to the int overload of __shfl_up.
 * @param width      forwarded unchanged to the int overload of __shfl_up
 *                   (defaults to warpSize).
 * @return the double reassembled from the two shuffled words.
 */
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    // The split below relies on a double occupying exactly two ints.
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    // Copy the object representation into two words (no aliasing issues).
    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    // Both halves use the same lane_delta/width, so they come from the same lane.
    words[0] = __shfl_up(words[0], lane_delta, width);
    words[1] = __shfl_up(words[1], lane_delta, width);

    // Copying the words back in the same order is the exact inverse of the
    // split above, so the original bit pattern is reproduced.
    double result;
    __builtin_memcpy(&result, words, sizeof(result));
    return result;
}
__device__
@@ -357,9 +371,16 @@ float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
/**
 * Double-precision overload of __shfl_down.
 *
 * The value is exchanged bit-exactly as two int-sized words, each moved with
 * the int overload of __shfl_down. It is never converted to float, which
 * would discard precision and range.
 *
 * @param var        value contributed by the calling lane.
 * @param lane_delta forwarded unchanged to the int overload of __shfl_down.
 * @param width      forwarded unchanged to the int overload of __shfl_down
 *                   (defaults to warpSize).
 * @return the double reassembled from the two shuffled words.
 */
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    // The split below relies on a double occupying exactly two ints.
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    // Copy the object representation into two words (no aliasing issues).
    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    // Both halves use the same lane_delta/width, so they come from the same lane.
    words[0] = __shfl_down(words[0], lane_delta, width);
    words[1] = __shfl_down(words[1], lane_delta, width);

    // Copying the words back in the same order is the exact inverse of the
    // split above, so the original bit pattern is reproduced.
    double result;
    __builtin_memcpy(&result, words, sizeof(result));
    return result;
}
__device__
@@ -387,9 +408,16 @@ float __shfl_xor(float var, int lane_mask, int width = warpSize) {
/**
 * Double-precision overload of __shfl_xor.
 *
 * The value is exchanged bit-exactly as two int-sized words, each moved with
 * the int overload of __shfl_xor. It is never converted to float, which would
 * discard precision and range.
 *
 * @param var       value contributed by the calling lane.
 * @param lane_mask forwarded unchanged to the int overload of __shfl_xor.
 * @param width     forwarded unchanged to the int overload of __shfl_xor
 *                  (defaults to warpSize).
 * @return the double reassembled from the two shuffled words.
 */
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    // The split below relies on a double occupying exactly two ints.
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    // Copy the object representation into two words (no aliasing issues).
    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    // Both halves use the same lane_mask/width, so they come from the same lane.
    words[0] = __shfl_xor(words[0], lane_mask, width);
    words[1] = __shfl_xor(words[1], lane_mask, width);

    // Copying the words back in the same order is the exact inverse of the
    // split above, so the original bit pattern is reproduced.
    double result;
    __builtin_memcpy(&result, words, sizeof(result));
    return result;
}
// 32-bit constant with all bits set in bytes 0 and 2 and clear in bytes 1 and 3.
// NOTE(review): the code that uses this mask is outside the visible range.
#define MASK1 0x00ff00ff