From 9ac31e0bb65029b164e40f76af4cfabf75fa1141 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Fri, 22 Jun 2018 19:11:35 +0000 Subject: [PATCH] Implement __shfl_* funcs into HIP headers --- include/hip/hcc_detail/device_functions.h | 121 ++++++++++++++++++++++ include/hip/hcc_detail/hip_runtime.h | 25 ----- src/device_util.cpp | 29 ------ 3 files changed, 121 insertions(+), 54 deletions(-) diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index 8ea6632ffc..aae0706033 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -282,6 +282,127 @@ __device__ static inline int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); } +static constexpr int warpSize = 64; + + __device__ +inline +int __shfl(int var, int src_lane, int width = warpSize) { + int self = __lane_id(); + int index = src_lane + (self & ~(width-1)); + return __llvm_amdgcn_ds_bpermute(index<<2, var); +} +__device__ +inline +unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) { + __u tmp; tmp.u = var; + tmp.i = __shfl(tmp.i, src_lane, width); + return tmp.u; +} +__device__ +inline +float __shfl(float var, int src_lane, int width = warpSize) { + __u tmp; tmp.f = var; + tmp.i = __shfl(tmp.i, src_lane, width); + return tmp.f; +} +__device__ +inline +double __shfl(double var, int src_lane, int width = warpSize) { + __u tmp; tmp.f = (float) var; + tmp.i = __shfl(tmp.i, src_lane, width); + return (double) tmp.f; +} + + __device__ +inline +int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) { + int self = __lane_id(); + int index = self - lane_delta; + index = (index < (self & ~(width-1)))?self:index; + return __llvm_amdgcn_ds_bpermute(index<<2, var); +} +__device__ +inline +unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) { + __u tmp; tmp.u = var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return tmp.u; +} +__device__ +inline +float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) { + __u tmp; tmp.f = var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return tmp.f; +} +__device__ +inline +double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) { + __u tmp; tmp.f = (float) var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return (double) tmp.f; +} + +__device__ +inline +int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) { + int self = __lane_id(); + int index = self + lane_delta; + index = (int)((self&(width-1))+lane_delta) >= width?self:index; + return __llvm_amdgcn_ds_bpermute(index<<2, var); +} +__device__ +inline +unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) { + __u tmp; tmp.u = var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return tmp.u; +} +__device__ +inline +float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) { + __u tmp; tmp.f = var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return tmp.f; +} +__device__ +inline +double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) { + __u tmp; tmp.f = (float) var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return (double) tmp.f; +} + +__device__ +inline +int __shfl_xor(int var, int lane_mask, int width = warpSize) { + int self = __lane_id(); + int index = self^lane_mask; + index = index >= ((self+width)&~(width-1))?self:index; + return __llvm_amdgcn_ds_bpermute(index<<2, var); +} +__device__ +inline +unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) { + __u tmp; tmp.u = var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return tmp.u; +} +__device__ +inline +float __shfl_xor(float var, int lane_mask, int width = warpSize) { + __u tmp; tmp.f = var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return tmp.f; +} +__device__ +inline +double __shfl_xor(double var, int lane_mask, int width = warpSize) { + __u tmp; tmp.f = (float) var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return (double) tmp.f; +} + #define MASK1 0x00ff00ff #define MASK2 0xff00ff00 diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 18b04daf77..8107f00a4e 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -184,36 +184,11 @@ extern int HIP_TRACE_API; #if __HCC_OR_HIP_CLANG__ -// TODO - hipify-clang - change to use the function call. -//#define warpSize hc::__wavesize() -static constexpr int warpSize = 64; - // abort __device__ void abort(); #if __HIP_ARCH_GFX701__ == 0 -// warp shuffle functions -#ifdef __cplusplus -__device__ int __shfl(int input, int lane, int width = warpSize); -__device__ int __shfl_up(int input, unsigned int lane_delta, int width = warpSize); -__device__ int __shfl_down(int input, unsigned int lane_delta, int width = warpSize); -__device__ int __shfl_xor(int input, int lane_mask, int width = warpSize); -__device__ float __shfl(float input, int lane, int width = warpSize); -__device__ float __shfl_up(float input, unsigned int lane_delta, int width = warpSize); -__device__ float __shfl_down(float input, unsigned int lane_delta, int width = warpSize); -__device__ float __shfl_xor(float input, int lane_mask, int width = warpSize); -#else -__device__ int __shfl(int input, int lane, int width); -__device__ int __shfl_up(int input, unsigned int lane_delta, int width); -__device__ int __shfl_down(int input, unsigned int lane_delta, int width); -__device__ int __shfl_xor(int input, int lane_mask, int width); -__device__ float __shfl(float input, int lane, int width); -__device__ float __shfl_up(float input, unsigned int lane_delta, int width); -__device__ float __shfl_down(float input, unsigned int lane_delta, int width); -__device__ float __shfl_xor(float input, int lane_mask, int width); -#endif //__cplusplus - __device__ unsigned __hip_ds_bpermute(int index, unsigned src); __device__ float __hip_ds_bpermutef(int index, float src); __device__ unsigned __hip_ds_permute(int index, unsigned src); diff --git a/src/device_util.cpp b/src/device_util.cpp index 853ca71c09..65ee5f4368 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -147,35 +147,6 @@ __device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { // abort __device__ void abort() { return hc::abort(); } -// warp shuffle functions -__device__ int __shfl(int input, int lane, int width) { return hc::__shfl(input, lane, width); } - -__device__ int __shfl_up(int input, unsigned int lane_delta, int width) { - return hc::__shfl_up(input, lane_delta, width); -} - -__device__ int __shfl_down(int input, unsigned int lane_delta, int width) { - return hc::__shfl_down(input, lane_delta, width); -} - -__device__ int __shfl_xor(int input, int lane_mask, int width) { - return hc::__shfl_xor(input, lane_mask, width); -} - -__device__ float __shfl(float input, int lane, int width) { return hc::__shfl(input, lane, width); } - -__device__ float __shfl_up(float input, unsigned int lane_delta, int width) { - return hc::__shfl_up(input, lane_delta, width); -} - -__device__ float __shfl_down(float input, unsigned int lane_delta, int width) { - return hc::__shfl_down(input, lane_delta, width); -} - -__device__ float __shfl_xor(float input, int lane_mask, int width) { - return hc::__shfl_xor(input, lane_mask, width); -} - __host__ __device__ int min(int arg1, int arg2) { return (int)(hc::precise_math::fmin((float)arg1, (float)arg2)); }