Implement __shfl_* funcs into HIP headers

Этот коммит содержится в:
Aaron Enye Shi
2018-06-22 19:11:35 +00:00
родитель 8ac864c2e3
Коммит 9ac31e0bb6
3 изменённых файлов: 121 добавлений и 54 удалений
+121
Просмотреть файл
@@ -282,6 +282,127 @@ __device__ static inline int __hip_move_dpp(int src, int dpp_ctrl, int row_mask,
return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
}
static constexpr int warpSize = 64;
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
int self = __lane_id();
int index = src_lane + (self & ~(width-1));
return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
__u tmp; tmp.u = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
__u tmp; tmp.f = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
__u tmp; tmp.f = (float) var;
tmp.i = __shfl(tmp.i, src_lane, width);
return (double) tmp.f;
}
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self - lane_delta;
index = (index < (self & ~(width-1)))?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
__u tmp; tmp.u = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
__u tmp; tmp.f = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
__u tmp; tmp.f = (float) var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return (double) tmp.f;
}
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self + lane_delta;
index = (int)((self&(width-1))+lane_delta) >= width?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
__u tmp; tmp.u = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
__u tmp; tmp.f = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
__u tmp; tmp.f = (float) var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return (double) tmp.f;
}
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
int self = __lane_id();
int index = self^lane_mask;
index = index >= ((self+width)&~(width-1))?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
__u tmp; tmp.u = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
__u tmp; tmp.f = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
__u tmp; tmp.f = (float) var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return (double) tmp.f;
}
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
-25
Просмотреть файл
@@ -184,36 +184,11 @@ extern int HIP_TRACE_API;
#if __HCC_OR_HIP_CLANG__
// TODO - hipify-clang - change to use the function call.
//#define warpSize hc::__wavesize()
static constexpr int warpSize = 64;
// abort
__device__ void abort();
#if __HIP_ARCH_GFX701__ == 0
// warp shuffle functions
#ifdef __cplusplus
__device__ int __shfl(int input, int lane, int width = warpSize);
__device__ int __shfl_up(int input, unsigned int lane_delta, int width = warpSize);
__device__ int __shfl_down(int input, unsigned int lane_delta, int width = warpSize);
__device__ int __shfl_xor(int input, int lane_mask, int width = warpSize);
__device__ float __shfl(float input, int lane, int width = warpSize);
__device__ float __shfl_up(float input, unsigned int lane_delta, int width = warpSize);
__device__ float __shfl_down(float input, unsigned int lane_delta, int width = warpSize);
__device__ float __shfl_xor(float input, int lane_mask, int width = warpSize);
#else
__device__ int __shfl(int input, int lane, int width);
__device__ int __shfl_up(int input, unsigned int lane_delta, int width);
__device__ int __shfl_down(int input, unsigned int lane_delta, int width);
__device__ int __shfl_xor(int input, int lane_mask, int width);
__device__ float __shfl(float input, int lane, int width);
__device__ float __shfl_up(float input, unsigned int lane_delta, int width);
__device__ float __shfl_down(float input, unsigned int lane_delta, int width);
__device__ float __shfl_xor(float input, int lane_mask, int width);
#endif //__cplusplus
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
-29
Просмотреть файл
@@ -147,35 +147,6 @@ __device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) {
// abort
__device__ void abort() { return hc::abort(); }
// warp shuffle functions
__device__ int __shfl(int input, int lane, int width) { return hc::__shfl(input, lane, width); }
__device__ int __shfl_up(int input, unsigned int lane_delta, int width) {
return hc::__shfl_up(input, lane_delta, width);
}
__device__ int __shfl_down(int input, unsigned int lane_delta, int width) {
return hc::__shfl_down(input, lane_delta, width);
}
__device__ int __shfl_xor(int input, int lane_mask, int width) {
return hc::__shfl_xor(input, lane_mask, width);
}
__device__ float __shfl(float input, int lane, int width) { return hc::__shfl(input, lane, width); }
__device__ float __shfl_up(float input, unsigned int lane_delta, int width) {
return hc::__shfl_up(input, lane_delta, width);
}
__device__ float __shfl_down(float input, unsigned int lane_delta, int width) {
return hc::__shfl_down(input, lane_delta, width);
}
__device__ float __shfl_xor(float input, int lane_mask, int width) {
return hc::__shfl_xor(input, lane_mask, width);
}
__host__ __device__ int min(int arg1, int arg2) {
return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));
}