Merge pull request #723 from mangupta/fix_double_shfl
Fix logic for double variants of __shfl*
[ROCm/clr commit: 05beeec84d]
Этот коммит содержится в:
@@ -297,9 +297,16 @@ float __shfl(float var, int src_lane, int width = warpSize) {
|
||||
__device__
|
||||
inline
|
||||
double __shfl(double var, int src_lane, int width = warpSize) {
|
||||
__u tmp; tmp.f = (float) var;
|
||||
tmp.i = __shfl(tmp.i, src_lane, width);
|
||||
return (double) tmp.f;
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
@@ -327,9 +334,16 @@ float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
|
||||
__device__
|
||||
inline
|
||||
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
|
||||
__u tmp; tmp.f = (float) var;
|
||||
tmp.i = __shfl_up(tmp.i, lane_delta, width);
|
||||
return (double) tmp.f;
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
@@ -357,9 +371,16 @@ float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
|
||||
__device__
|
||||
inline
|
||||
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
|
||||
__u tmp; tmp.f = (float) var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return (double) tmp.f;
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
@@ -387,9 +408,16 @@ float __shfl_xor(float var, int lane_mask, int width = warpSize) {
|
||||
__device__
|
||||
inline
|
||||
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
|
||||
__u tmp; tmp.f = (float) var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return (double) tmp.f;
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
#define MASK1 0x00ff00ff
|
||||
|
||||
Ссылка в новой задаче
Block a user